// Purpose: Data Cleaning for STUDENT SURVEY
// Last modified: 05 March 2019
// Coding 
	*m_var : missing cases
	*i_var : inconsistencies (ranges,internal logic, etc.)
	*s_var : skipping patterns
	*o_var : outliers -1.5*IQR<var<1.5*IQR
	*u_var : un-categorized discrepancies

// PREPARATION

* Start from an empty session, disable output paging, and close any log
* that is still open (capture keeps this silent when no log is open).
clear all
set more off
capture log close

* Set the raw-data directory here before running the do-file.
global raw_dir "[directory]/Raw_data"

* Work from the raw-data directory; relative paths used later in this
* do-file resolve against it.
cd "$raw_dir"

* Load the appended survey dataset.
use "appended_full clean_updated.dta"


// GENERAL CLEANING


*1.IDENTIFICATION OF SCHOOL
* For each item: m_ is 1 when the answer is missing, i_ is 1 when the answer
* is not one of the listed codes (a missing answer is also not in the list).

	*100 province
generate m_100 = missing(province_201)
generate i_100 = !inlist(province_201, 1, 2, 3, 4, 5)

	*101 district (codes 1 to 30)
generate m_101 = missing(district_201)
generate i_101 = !inlist(district_201, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ///
	11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ///
	21, 22, 23, 24, 25, 26, 27, 28, 29, 30)

	*102 sector
generate m_102 = missing(sector_201)

	*103 cell
generate m_103 = missing(cell_201)

	*104 village
generate m_104 = missing(village_201)

	*107 Group (treatment vs. control)
generate m_107 = missing(group_107)
generate i_107 = !inlist(group_107, 1, 2)


*2.STUDENT IDENTIFICATION
* m_ flags a missing answer; i_ flags an answer outside the listed codes.

	*201 location (rural vs. urban)
generate m_201loc = missing(loc_201)
generate i_201loc = !inlist(loc_201, 1, 2)

	*201 family rent
generate m_201hou = missing(house_201)
generate i_201hou = !inlist(house_201, 1, 2, 3)

	*202 boarding or day student
generate m_202 = missing(board_day_202)
generate i_202 = !inlist(board_day_202, 1, 2)

	*203 how far is home (only missingness is checked)
generate m_203 = missing(dist_min_203)

	*204 means of transport
generate m_204 = missing(trans_204)
generate i_204 = !inlist(trans_204, 1, 2, 3, 4)


*3. SOCIO ECONOMIC BACKGROUND CHARACTERISTICS
* m_ flags a missing answer, i_ an answer outside the listed codes,
* o_ a value outside the accepted range.

	*300 age
	* Accepted range is 15 to 20. Stata treats numeric missing values as
	* larger than any number, so age_300 > 20 alone would also be true for
	* missing ages; those are excluded here because m_300 already reports them.
generate m_300 = missing(age_300)
generate o_300 = (age_300 > 20 | age_300 < 15) & !missing(age_300)

	*301 gender
generate m_301 = missing(gender_301)
generate i_301 = !inlist(gender_301, 1, 2)

	*302 have parents
generate m_302 = missing(parents_302)
generate i_302 = !inlist(parents_302, 1, 2, 3, 4)

	*303 main material of the house floor (-66 is an accepted code)
generate m_303 = missing(floor_mat_303)
generate i_303 = !inlist(floor_mat_303, 1, 2, 3, 4, 5, -66)

	*304 main material of the roof
generate m_304 = missing(roof_mat_304)

	*305 household water source
generate m_305 = missing(water_305)
	* A literal "." stored in the free-text field is turned into an empty string.
replace water_305_other = "" if water_305_other == "."

	*311 does any household member own a business
generate m_311 = missing(business_311)
generate i_311 = !inlist(business_311, 0, 1)

	
	*4. ACADEMIC BACKGROUND AND PERFORMANCE

	*403 Times you have repeated
	* Accepted range is 0 to 5. Numeric missing values compare as larger than
	* any number in Stata, so they are excluded explicitly; otherwise every
	* unanswered item would be reported as an outlier.
foreach grade in 4 5 6 {
	generate o_403_`grade' = (reptimes`grade'_403 < 0 | reptimes`grade'_403 > 5) ///
		& !missing(reptimes`grade'_403)
}

	*404 Aggregate mark in last promotion exam
	* Accepted range is 10 to 90. The former separate test for values below 0
	* was already covered by the test for values below 10. Missing marks are
	* reported by m_404 only, not as outliers.
generate m_404 = missing(aggr_404)
generate o_404 = (aggr_404 < 10 | aggr_404 > 90) & !missing(aggr_404)
	* TODO: document the scale - the closer the aggregate is to 0, the better the result.
	

	*5.Income, Business and employment
	* Missing / invalid-code flags for the earnings item.
generate m_500 = missing(earn_500e)
generate i_500 = !inlist(earn_500e, 1, 2, 3, 4)

	* The follow-up items below are checked only for observations where
	* earn_500e is not 4. The condition is written into each expression so it
	* is evaluated per observation: the if programming command with braces
	* evaluates its expression once, using the first observation only, and
	* would then create the flags for everybody or for nobody.
	* Observations with earn_500e equal to 4 get 0 on every flag below.
	* NOTE(review): a missing earn_500e also satisfies earn_500e != 4, so such
	* rows are checked - confirm that this is the intended skip pattern.

	*503
	* Accepted amounts are above 0 and up to 50000; missing amounts (including
	* extended missing values) are reported by m_503 only.
generate m_503 = missing(amt_503) & earn_500e != 4
generate i_503 = (amt_503 <= 0 | amt_503 > 50000) & !missing(amt_503) & earn_500e != 4

	* Type of business (options a to d) and the business taking most time
foreach opt in a b c d {
	generate m_type_busi_`opt' = missing(type_busi_402_`opt'_) & earn_500e != 4
}
generate m_type_busi_403 = missing(mosttime_busi_403) & earn_500e != 4

	*7.Student Business Club activity

	*700 / 701
	* The who-supported items (701_1 to 701_7) are checked for missingness only
	* for observations with busiclub_700 equal to 1. The condition is part of
	* the expression so it is evaluated per observation; the if programming
	* command with braces would look at the first observation only.
	* All other observations get 0.
forvalues k = 1/7 {
	generate m_701_`k' = missing(whosupp_701_`k') & busiclub_700 == 1
}

	*8.ENTREPRENEURSHIP SKILLS AND PERSONAL FINANCE
	* m_ flags a missing answer; i_ flags an answer outside the listed codes
	* (-99 is an accepted code for 801 to 803).

	*800
generate m_800 = missing(borrow_800)

	*801
generate m_801 = missing(moneyoffer_801)
generate i_801 = !inlist(moneyoffer_801, 1, 2, -99)

	*802
generate m_802 = missing(moneyoffer_802)
generate i_802 = !inlist(moneyoffer_802, 1, 2, -99)

	*803
generate m_803 = missing(savings_803)
generate i_803 = !inlist(savings_803, 1, 2, 3, -99)


	*9 YOUTH SKILL DEVELOPMENT SCALE

	*901
generate m_901 = missing(busplan_901)

	* i_903 and i_1004 are created as all-zero placeholders; the consistency
	* checks that would set them to 1 are disabled in the comment blocks below.
	* NOTE(review): before re-enabling, the disabled conditions need ==1 where
	* they currently have =1, and the variable names should be checked against
	* the dataset.
generate i_903 = 0
/*foreach var in incrproft_1003_SellMorePaper incrproft_1003_IncreasPrice incrproft_1003_EmploySone incrproft_1003_UseCheaperMat incrproft_903__66 {
    
	replace i_903 =1 if `var'==1 & incrproft_903__55=1
	}
*/	

generate i_1004 = 0
/*foreach var in growthindic_904_1 growthindic_904_2 growthindic_904_3 growthindic_904_4 growthindic_904_5 growthindic_904__66 {
    
	replace i_1004 =1 if `var'==1 & incrproft_903__99=1
	}
*/
	
	*11. ASPIRATIONS
	* m_ flags a missing answer; i_ flags an answer outside the listed codes
	* (-99 is an accepted code for both items).

	*1100
generate m_1100 = missing(schooling_1100)
generate i_1100 = !inlist(schooling_1100, 1, 2, 3, 4, 5, 6, -99)

	*1102
generate m_1102 = missing(busnsftr_1102)
generate i_1102 = !inlist(busnsftr_1102, 1, 2, 3, -99)


	*12 ABOUT YOUR LIFE
	* m_ flags a missing answer; i_ flags an answer outside the accepted codes.

	* Code list shared by items 1200, 1202, 1204, 1206 and 1208
local codes_1to10 1,2,3,4,5,6,7,8,9,10,-99,-55

	*1200
generate m_1200 = missing(univ_1200)
generate i_1200 = !inlist(univ_1200, `codes_1to10')

	*1201
generate m_1201 = missing(reas_1201)
generate i_1201 = !inlist(reas_1201, 1, 2, 3, 4, 5, 6)

	*1202
generate m_1202 = missing(hswork_1202)
generate i_1202 = !inlist(hswork_1202, `codes_1to10')

	*1203
generate m_1203 = missing(reas_1203)
generate i_1203 = !inlist(reas_1203, 1, 2, 3)

	*1204
generate m_1204 = missing(child_1204)
generate i_1204 = !inlist(child_1204, `codes_1to10')

	*1205
generate m_1205 = missing(reas_1205)
generate i_1205 = !inlist(reas_1205, 1, 2, 3, 4)

	*1206
generate m_1206 = missing(head_1206)
generate i_1206 = !inlist(head_1206, `codes_1to10')

	*1207
generate m_1207 = missing(reas_1207)
generate i_1207 = !inlist(reas_1207, 1, 2, 3, 4)

	*1208
generate m_1208 = missing(safe_1208)
generate i_1208 = !inlist(safe_1208, `codes_1to10')

	*1209
generate m_1209 = missing(reas_1209)
generate i_1209 = !inlist(reas_1209, 1, 2)


	*13. YOUR INTERESTS
	* m_ flags a missing answer; i_ flags an answer outside the accepted codes.
	* All four items accept the same code list.
local codes_13 1,2,3,4,5,-66

	*1300
generate m_1300 = missing(ideasproj_1300)
generate i_1300 = !inlist(ideasproj_1300, `codes_13')

	*1301
generate m_1301 = missing(diffintrest_1301)
generate i_1301 = !inlist(diffintrest_1301, `codes_13')

	*1302
generate m_1302 = missing(passchang_1302)
generate i_1302 = !inlist(passchang_1302, `codes_13')

	*1303
generate m_1303 = missing(newproj_1303)
generate i_1303 = !inlist(newproj_1303, `codes_13')


	*14 YOUR IDEAS

	// RELEASING ISSUES

* Place the timing variables right after time_interview.
order submissiondate starttime endtime visit_date_112, after(time_interview)

* One summary indicator per flag family, created in this order:
*   all_miss  = 1 when any m_ flag equals 1
*   all_incoh = 1 when any i_ flag equals 1
*   all_ouli  = 1 when any o_ flag equals 1
local prefixes  m i o
local summaries all_miss all_incoh all_ouli
forvalues k = 1/3 {
	local prefix  : word `k' of `prefixes'
	local summary : word `k' of `summaries'

	generate `summary' = 0
	* ds lists the flag variables of this family and leaves them in r(varlist)
	ds `prefix'_*
	foreach flag in `r(varlist)' {
		replace `summary' = 1 if `flag' == 1
	}
}

* Overall indicator: 1 when any of the summary indicators equals 1.
* all_discr is part of the list as well; that pass changes nothing.
generate all_discr = 0
foreach summary of varlist all_miss all_incoh all_ouli all_discr {
	replace all_discr = 1 if `summary' == 1
}

* Report how many questionnaires carry at least one discrepancy.
sort all_discr
count if all_discr == 1

noisily display "The number of questionnaires with issues is" " " `r(N)'


* Save the full dataset together with all flag variables.
* Edit the [directory] placeholder in the path before running.
save "[directory]/appended_full clean_wcleandofilerun.dta", replace

* Tag repeated student IDs: duplicate holds the number of other observations
* sharing the same studentid_202 (0 when the ID is unique).
* The variable is created after the save above, so it appears only in the
* exported list, not in the saved .dta file.
duplicates tag studentid_202, generate(duplicate)

* Keep the identifiers and the flag variables, ordered by district, school and student.
sort district_201 schoolid school_name studentid_202
keep district_201 schoolid school_name duplicate studentid_202 m_* o_* i_*

* Write the issue list as a comma-separated file in the current working directory.
outsheet using "Student_ListofIssues.csv", comma replace

